# Load every raw monitor CSV and stack them into one data frame.
# Sources use different column names, so rename any recognised aliases to a
# common schema before binding. (`col_map` avoids shadowing base `names()`.)
col_map <- c(ws = 'windSpeed', wd = 'windDirection', H2S = 'Value', QC.Code = 'qcCode',
             Monitor = 'siteName', MinDist = 'NEAR_DIST')
data_list <- list()
for (file in list.files('data/raw_csv', full.names = TRUE)){
  new_data <- read.csv(file) %>% rename(any_of(col_map))
  # Report how many H2S readings are missing in this file (printed below).
  print(sum(is.na(new_data$H2S)))
  data_list[[file]] <- new_data
}
# Bind once at the end instead of growing the data frame inside the loop.
full_data <- bind_rows(data_list)
remove(data_list)
## [1] 112
## [1] 0
## [1] 9398
## [1] 9771
## [1] 32036
## [1] 11599
## [1] 12440
## [1] 11914
## [1] 18019
## [1] 11424
## [1] 0
## [1] 23974
## [1] 0
# Drop the loop temporary and reclaim memory.
rm(new_data)
gc()
##             used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   3638185 194.4    6079797  324.7   4065484  217.2
## Vcells 102610639 782.9  223961814 1708.7 199248175 1520.2

Compute date variables

# Parse the raw DateTime strings: the three school monitors export
# 'ymd HMS' timestamps, every other source uses 'mdy HM'.
# parse_date_time() already returns POSIXct in UTC by default, so the
# original trailing as.POSIXct(..., tz = 'UTC') step was a no-op and is removed.
# NOTE(review): rows matching neither format parse to NA — the warning below
# reports 2,031,728 failures across the two branches; confirm this is expected.
full_data <- full_data %>% 
  mutate(DateTime = case_when(Monitor %in% c('North HS', 'West HS', 'Elm Ave') ~ parse_date_time(DateTime, 'ymd HMS'),
                              .default = parse_date_time(DateTime, 'mdy HM')))
## Warning: There were 2 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `DateTime = case_when(...)`.
## Caused by warning:
## !  2031728 failed to parse.
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
# Truncate each timestamp to its calendar day (midnight, keeping the UTC tz).
# floor_date() avoids the costly POSIXct -> character -> POSIXct round trip.
full_data <- full_data %>% 
  mutate(day = floor_date(DateTime, unit = 'day'))
 
# Remove observations before 2020. Compare against an explicit-UTC timestamp:
# comparing POSIXct to the bare string '2019-12-31' parses the string in the
# machine's local timezone, so the cutoff would shift with the render machine.
full_data <- full_data %>% filter(day >= as.POSIXct('2020-01-01', tz = 'UTC')) %>% distinct()
 
# Calendar features used for grouping and joining later.
full_data <- full_data %>% 
  mutate(yearmonth = format(day, "%Y-%m"),
         year = format(day, "%Y"),
         month = format(day, "%m"),
         # Reference the column directly instead of full_data$day — the global
         # reference only worked because the filter above was assigned first.
         weekday = relevel(factor(wday(day, label = TRUE), ordered = FALSE), ref = "Sun"))

Add wind data for El Segundo and St Anthony

# KLAX (LAX airport) weather; the first data row is a units header, drop it.
klax_data <- read_csv('data/KLAX_2020-2023.csv', show_col_types = FALSE) %>% 
  filter(row_number() != 1)

# Collapse to daily values. Wind direction is averaged on the circle so that
# e.g. 350 and 10 degrees average to 0, not 180. (TRUE spelled out — `T` is
# an ordinary reassignable variable in R.)
klax_daily <- klax_data %>%
  group_by(Date) %>%
  summarise(avg_temp = mean(as.numeric(air_temp), na.rm = TRUE),
            avg_hum = mean(as.numeric(relative_humidity), na.rm = TRUE),
            # the 24h accumulation is reported once per day: take first non-NA
            precip = precip_accum_24_hour[which(!is.na(precip_accum_24_hour))[1]],
            ws = mean(as.numeric(wind_speed), na.rm = TRUE),
            wd = as.numeric(mean(circular(as.numeric(wind_direction), units = 'degrees'), na.rm = TRUE))) %>%
  mutate(precip = coalesce(as.numeric(precip), 0),  # days with no report -> 0
         Date = parse_date_time(Date, c('m/d/y')),
         Date = as.POSIXct(Date, "%Y-%m-%d", tz = 'UTC'))

# El Segundo and St. Anthony report no usable wind of their own, so replace
# their ws/wd with the daily KLAX airport values. bind_rows() matches columns
# by name, unlike rbind() which depends on column order.
full_data <- bind_rows(full_data %>% 
                         filter(!(Monitor %in% c('El Segundo', 'St. Anthony'))), 
                       full_data %>%
                         select(-c(wd, ws)) %>%
                         filter(Monitor %in% c('El Segundo', 'St. Anthony')) %>%
                         left_join(klax_daily %>% select(Date, ws, wd), join_by(day == Date))) 
# Remove rows without wd or ws
full_data <- full_data %>% 
  drop_na(wd, ws)
# Check for duplicate time
# full_data %>%
#    group_by(DateTime, Monitor) %>%
#    summarise(n = n()) %>%
#    filter(n > 1)

We observe multiple rows for the same date-time and monitor. From a quick glance, some of these may stem from a conversion issue from Date.Time to DateTime, since the H2S measurements differ. On other occasions, multiple rows exist as complements, where one row contains data for the values missing from the other. It also appears that we sometimes get different measurements for the same timestamp even on days without a daylight-saving transition; this remains to be explained.

# full_data %>%
#     group_by(DateTime, Monitor) %>%
#     summarise(n = n()) %>%
#     filter(n > 1) %>%
#     mutate(day = as.POSIXct(format(DateTime, "%Y-%m-%d"), format="%Y-%m-%d", tz = 'UTC')) %>%
#     group_by(day, Monitor) %>%
#     summarise(n = n())
# Deal with duplicate rows for same time

# For the 213 monitor it seems like the duplicate rows are complements to the other
#https://stackoverflow.com/questions/45515218/combine-rows-in-data-frame-containing-na-to-make-complete-row
# coalesce_by_column <- function(df) {
#   return(dplyr::coalesce(!!! as.list(df)))
# }
# 
# data_213 <- full_data %>%
#   filter(Monitor == '213th & Chico') %>%
#   group_by(DateTime) %>%
#   summarise_all(coalesce_by_column)
# 
# full_data <- full_data %>% 
#   filter(Monitor != '213th & Chico') %>%
#   rbind(data_213)
# Inspect the structure of the merged data (output below).
glimpse(full_data)
## Rows: 3,315,509
## Columns: 32
## $ DateTime           <dttm> 2021-10-22 15:15:00, 2021-10-22 15:20:00, 2021-10-…
## $ Date.Time          <chr> "10/22/2021 15:15", "10/22/2021 15:20", "10/22/2021…
## $ DaylightSavings    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ H2S                <dbl> 138.79, 183.26, 149.24, 263.18, 308.97, 169.06, 234…
## $ Unit               <chr> "ppb", "ppb", "ppb", "ppb", "ppb", "ppb", "ppb", "p…
## $ QC.Code            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ MDL                <dbl> 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0…
## $ Averaging.Hour     <chr> "5 Min", "5 Min", "5 Min", "5 Min", "5 Min", "5 Min…
## $ Monitor            <chr> "213th & Chico", "213th & Chico", "213th & Chico", …
## $ latitude           <dbl> 33.83678, 33.83678, 33.83678, 33.83678, 33.83678, 3…
## $ longitude          <dbl> -118.2586, -118.2586, -118.2586, -118.2586, -118.25…
## $ ws                 <dbl> 7.01, 6.32, 6.73, 6.54, 8.02, 7.82, 6.99, 6.71, 6.3…
## $ wd                 <dbl> 276.87, 250.15, 263.97, 280.36, 269.26, 256.21, 264…
## $ wd_QCcode          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ws_QCcode          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Join_Count         <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ MinDist            <dbl> 2873.053, 2873.053, 2873.053, 2873.053, 2873.053, 2…
## $ Converted_Angle    <int> 306, 306, 306, 306, 306, 306, 306, 306, 306, 306, 3…
## $ Refinery           <chr> "Marathon (Carson)", "Marathon (Carson)", "Marathon…
## $ siteId             <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ unitName           <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ qcName             <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ opName             <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ windSpeed_Unit     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ windDirection_Unit <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ opCode             <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Source             <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ day                <dttm> 2021-10-22, 2021-10-22, 2021-10-22, 2021-10-22, 20…
## $ yearmonth          <chr> "2021-10", "2021-10", "2021-10", "2021-10", "2021-1…
## $ year               <chr> "2021", "2021", "2021", "2021", "2021", "2021", "20…
## $ month              <chr> "10", "10", "10", "10", "10", "10", "10", "10", "10…
## $ weekday            <fct> Fri, Fri, Fri, Fri, Fri, Fri, Fri, Fri, Fri, Fri, F…
# Column-wise summaries; note the NA counts on H2S and the QC/unit fields.
summary(full_data)
##     DateTime                       Date.Time         DaylightSavings  
##  Min.   :2020-01-01 00:00:00.00   Length:3315509     Min.   :0        
##  1st Qu.:2021-04-29 07:15:00.00   Class :character   1st Qu.:0        
##  Median :2021-12-13 21:30:00.00   Mode  :character   Median :0        
##  Mean   :2021-12-08 01:00:39.75                      Mean   :0        
##  3rd Qu.:2022-08-21 17:15:00.00                      3rd Qu.:0        
##  Max.   :2023-05-21 00:55:00.00                      Max.   :1        
##                                                      NA's   :1064847  
##       H2S              Unit              QC.Code            MDL         
##  Min.   :   0.00   Length:3315509     Min.   :  0      Min.   :0.4      
##  1st Qu.:   0.22   Class :character   1st Qu.:  0      1st Qu.:0.4      
##  Median :   0.60   Mode  :character   Median :  0      Median :0.4      
##  Mean   :   1.13                      Mean   :  0      Mean   :0.4      
##  3rd Qu.:   1.11                      3rd Qu.:  0      3rd Qu.:0.4      
##  Max.   :3817.24                      Max.   :121      Max.   :0.4      
##  NA's   :102956                       NA's   :461526   NA's   :1167803  
##  Averaging.Hour       Monitor             latitude       longitude     
##  Length:3315509     Length:3315509     Min.   :33.78   Min.   :-118.4  
##  Class :character   Class :character   1st Qu.:33.79   1st Qu.:-118.3  
##  Mode  :character   Mode  :character   Median :33.82   Median :-118.3  
##                                        Mean   :33.84   Mean   :-118.3  
##                                        3rd Qu.:33.86   3rd Qu.:-118.3  
##                                        Max.   :33.92   Max.   :-118.2  
##                                        NA's   :301     NA's   :301     
##        ws               wd           wd_QCcode         ws_QCcode      
##  Min.   : 0.000   Min.   :-179.5   Min.   :0         Min.   :0        
##  1st Qu.: 1.780   1st Qu.: 101.7   1st Qu.:0         1st Qu.:0        
##  Median : 3.610   Median : 213.2   Median :0         Median :0        
##  Mean   : 4.164   Mean   : 182.1   Mean   :0         Mean   :0        
##  3rd Qu.: 6.147   3rd Qu.: 282.0   3rd Qu.:0         3rd Qu.:0        
##  Max.   :57.780   Max.   : 360.0   Max.   :7         Max.   :8        
##                                    NA's   :1064847   NA's   :1064847  
##    Join_Count       MinDist       Converted_Angle   Refinery        
##  Min.   :1.000   Min.   : 758.7   Min.   : 11.0   Length:3315509    
##  1st Qu.:1.000   1st Qu.:1308.3   1st Qu.:189.0   Class :character  
##  Median :1.000   Median :1988.2   Median :209.0   Mode  :character  
##  Mean   :1.912   Mean   :1948.0   Mean   :205.5                     
##  3rd Qu.:3.000   3rd Qu.:2541.9   3rd Qu.:263.0                     
##  Max.   :5.000   Max.   :3592.8   Max.   :338.0                     
##                                                                     
##     siteId            unitName            qcName             opName         
##  Length:3315509     Length:3315509     Length:3315509     Length:3315509    
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  windSpeed_Unit     windDirection_Unit     opCode           Source         
##  Length:3315509     Length:3315509     Min.   :0         Length:3315509    
##  Class :character   Class :character   1st Qu.:0         Class :character  
##  Mode  :character   Mode  :character   Median :0         Mode  :character  
##                                        Mean   :0                           
##                                        3rd Qu.:0                           
##                                        Max.   :0                           
##                                        NA's   :2609232                     
##       day                          yearmonth             year          
##  Min.   :2020-01-01 00:00:00.00   Length:3315509     Length:3315509    
##  1st Qu.:2021-04-29 00:00:00.00   Class :character   Class :character  
##  Median :2021-12-13 00:00:00.00   Mode  :character   Mode  :character  
##  Mean   :2021-12-07 13:03:15.26                                        
##  3rd Qu.:2022-08-21 00:00:00.00                                        
##  Max.   :2023-05-21 00:00:00.00                                        
##                                                                        
##     month           weekday     
##  Length:3315509     Sun:471574  
##  Class :character   Mon:466861  
##  Mode  :character   Tue:472058  
##                     Wed:475719  
##                     Thu:476384  
##                     Fri:476201  
##                     Sat:476712

Some concerns for the data include the following. For TorranceAir_H2SWind there are actually three sites, but only one site contains ws/wd information. What is the difference between DateTime and Date/Time? DateTime is the time after conversion. What is the difference between Date/Time and Date/Time.First? The latter is present only in FirstMethodist. Missing H2S values could be missing due to a mismatch from merging, or flagged due to issues such as low detection (minimum detection limit: 0.4).

Any additional cleaning

# For Judson, it appeared that longitude had some significant figure issues.
# Correct monitor coordinates via lookup vectors; any monitor not listed keeps
# its original value (the lookup yields NA and coalesce() falls back).
lon_fix <- c('Judson'       = -118.268128,
             'West HS'      = -118.36836539131416,
             'Elm Ave'      = -118.3316298600153,
             'North HS'     = -118.33660803949037,
             'G Street'     = -118.2818168,
             'Guenser Park' = -118.313499,
             'Harbor Park'  = -118.286028)
lat_fix <- c('West HS'      = 33.85032033929173,
             'Elm Ave'      = 33.84048285612423,
             'North HS'     = 33.86785391284254,
             'G Street'     = 33.77785794,
             'Guenser Park' = 33.869231,
             'Harbor Park'  = 33.786252)
full_data <- full_data %>%
  mutate(longitude = coalesce(unname(lon_fix[Monitor]), longitude),
         latitude = coalesce(unname(lat_fix[Monitor]), latitude))
rm(lon_fix, lat_fix)
# For observations below MDL, set it to half of MDL (0.2).
# As confirmed, MDL for all monitors is 0.4.
# Check how many observations are below MDL. Use the masked column names
# directly — the original empty group_by() was a no-op, and full_data$H2S
# inside the pipe bypasses data masking (it only worked because the piped
# data happened to be identical to the global).
full_data %>%
  summarise('Below MDL' = sum(H2S < 0.4, na.rm = TRUE),
            'Proportion' = sum(H2S < 0.4, na.rm = TRUE) / n())
full_data <- full_data %>%
  mutate(H2S = if_else(H2S < 0.4, 0.2, H2S))

Compute Daily Max H2S

# Compute daily max H2S per monitor-day.
# max() on an all-NA group with na.rm = TRUE warns and returns -Inf; guard
# explicitly and record NA for those days instead of patching -Inf afterwards.
# .groups = 'drop' silences the regrouping message and returns a plain tibble.
H2S_dm <- full_data %>%
  group_by(day, Monitor) %>%
  summarise(H2S_daily_max = if (all(is.na(H2S))) NA_real_ else max(H2S, na.rm = TRUE),
            .groups = 'drop')
## Warning: There were 171 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `H2S_daily_max = max(H2S, na.rm = TRUE)`.
## ℹ In group 1660: `day = 2021-01-19`, `Monitor = "G Street"`.
## Caused by warning in `max()`:
## ! no non-missing arguments to max; returning -Inf
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 170 remaining warnings.
## `summarise()` has grouped output by 'day'. You can override using the `.groups`
## argument.
# Attach the daily maximum back onto every observation of that monitor-day.
full_data <- full_data %>%
  left_join(H2S_dm, join_by(day, Monitor))

Compute Monthly Average H2S

# Monthly average of the daily maxima, per monitor.
# .groups = 'drop' silences the regrouping message and avoids a lingering group.
H2S_ma <- H2S_dm %>%
  mutate(yearmonth = format(day, "%Y-%m")) %>%
  group_by(yearmonth, Monitor) %>%
  summarise(H2S_monthly_average = mean(H2S_daily_max, na.rm = TRUE),
            .groups = 'drop')
## `summarise()` has grouped output by 'yearmonth'. You can override using the
## `.groups` argument.
# Merge the monthly averages, then discard the helper tables.
full_data <- full_data %>%
  left_join(H2S_ma, join_by(yearmonth, Monitor))

rm(H2S_ma, H2S_dm)

Compute daily average H2S

# Compute daily average H2S per monitor-day.
# .groups = 'drop' silences the regrouping message and returns a plain tibble.
H2S_da <- full_data %>%
  group_by(day, Monitor) %>%
  summarise(H2S_daily_avg = mean(H2S, na.rm = TRUE), .groups = 'drop')
## `summarise()` has grouped output by 'day'. You can override using the `.groups`
## argument.
# Attach the daily averages, then discard the helper table.
full_data <- full_data %>%
  left_join(H2S_da, join_by(day, Monitor))

rm(H2S_da)

Compute wind sector

# Bucket wind direction into quadrants: Q1 = [0, 90), Q2 = [90, 180),
# Q3 = [180, 270), Q4 = [270, 360]. With right = FALSE, include.lowest = TRUE
# closes the final interval at 360, matching the original case_when bounds.
# NOTE(review): the summary above shows wd as low as -179.5 (KLAX circular
# means); values outside [0, 360] become NA here, as in the original — confirm
# that is intended rather than normalising with wd %% 360 first.
full_data <- full_data %>%
  mutate(wd_sec = as.character(cut(wd,
                                   breaks = c(0, 90, 180, 270, 360),
                                   labels = c('Q1', 'Q2', 'Q3', 'Q4'),
                                   right = FALSE, include.lowest = TRUE)))

Water treatment plants

# Load LA water reclamation plants and project everything to UTM zone 11 so
# that distances come out in metres.
la_wrp <- read_sf('shapefiles/LA_WRP.shp', layer = 'LA_WRP')

CRS_UTM <- CRS("+proj=utm +zone=11 ellps=WGS84")

# Convert both water treatment plants and monitor coordinates to UTM
la_wrp <- st_transform(la_wrp, CRS_UTM)

monitors <- full_data %>%
  select(Monitor, latitude, longitude) %>%
  distinct() %>% 
  st_as_sf(coords = c('longitude', 'latitude')) %>% 
  st_set_crs(4326) %>% 
  st_transform(CRS_UTM)

# Nearest plant per monitor: compute the nearest-feature index vector once
# instead of calling st_nearest_feature() separately for each derived column.
nearest_wrp <- st_nearest_feature(monitors, la_wrp)
monitor_wrp <- monitors %>% 
  mutate(wrp_name = la_wrp$CWP_NAM[nearest_wrp],
         dist_wrp = apply(st_distance(monitors, la_wrp), 1, min),
         capacity = la_wrp$cpcty_m[nearest_wrp])

# we also want to find the angle between the monitor and wrp
# Build a two-column table (monitor point, nearest-plant point) and pivot it
# longer so the points interleave row-wise: monitor1, plant1, monitor2, plant2, ...
angles <- tibble(monitors$geometry, la_wrp$geometry[st_nearest_feature(monitors, la_wrp)]) %>%
  pivot_longer(cols = everything()) %>% 
  pull(value) %>% # extract coordinates only
  st_transform(4326) %>% # convert to lat/lon for function below
  st_geod_azimuth() %>% # azimuth between each consecutive pair of points
  set_units('degrees') %>% # convert to degrees
  drop_units()

# st_geod_azimuth() returns one angle per consecutive pair, including the
# meaningless plant(i) -> monitor(i+1) pairs; the odd indices are the
# monitor -> plant azimuths we want.
angles <- angles[c(T, F)] # keep only odd index, valid pairs
# Normalise azimuths from (-180, 180] into [0, 360).
angles <- if_else(angles < 0, angles + 360, angles)

monitor_wrp$wrp_angle <- angles

# Fold the per-monitor WRP attributes (name, distance, capacity, azimuth) into
# the main table; drop the sf geometry so the join result stays a plain tibble.
full_data <- full_data %>%
  left_join(as_tibble(monitor_wrp) %>% select(-geometry), join_by(Monitor))

Dominguez Channel

# Read in the shape file, it already has a CRS
# NOTE(review): this st_read() call only echoes the layer metadata (shown
# below); its result is discarded and the same file is read again with
# read_sf() afterwards — one read would suffice.
st_read('shapefiles/DominguezChannel_Carson.shp')
## Reading layer `DominguezChannel_Carson' from data source 
##   `D:\313\Year4Uni\Reading Course\H2S-study\shapefiles\DominguezChannel_Carson.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 9 features and 17 fields
## Geometry type: LINESTRING
## Dimension:     XYZ
## Bounding box:  xmin: -118.3382 ymin: 33.77701 xmax: -118.2275 ymax: 33.92881
## z_range:       zmin: 0 zmax: 0
## Geodetic CRS:  WGS 84
# Re-read the channel as an sf tibble and project to UTM zone 11 (metres).
d_channel <- read_sf('shapefiles/DominguezChannel_Carson.shp', layer = 'DominguezChannel_Carson')
d_channel <- st_transform(d_channel, CRS_UTM) # convert to UTM crs

# Distance from each monitor to the channel, and to the 213th & Chico site.
chico_site <- monitors[monitors$Monitor == '213th & Chico', ]
monitor_d_channel <- monitors %>%
  mutate(dist_dc = apply(st_distance(monitors, d_channel), 1, min),
         dist_213 = as.numeric(drop_units(st_distance(monitors, chico_site))))

full_data <- full_data %>%
  left_join(as_tibble(monitor_d_channel) %>% select(-geometry), join_by(Monitor))
# Sanity-check map: the channel plus each monitor labelled with its distance (m).
labelled_monitors <- monitors %>%
  mutate(dist_dc = apply(st_distance(monitors, d_channel), 1, min))
dist_plot <- ggplot(d_channel) +
  geom_sf() +
  geom_sf(data = monitors) +
  geom_sf_label(data = labelled_monitors,
                aes(label = paste0(Monitor, ' ', round(dist_dc, 2)))) +
  theme_minimal()

dist_plot

# Done with the spatial helpers; free the memory.
rm(dist_plot, la_wrp, monitor_wrp, monitor_d_channel, d_channel)
gc()
##             used   (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   4006932  214.0    6079797  324.7   6079797  324.7
## Vcells 324227355 2473.7  560834762 4278.9 560830006 4278.8

Weather data from National Weather Service

# klax_data <- read_csv('data/KLAX_2020-2023.csv', show_col_types = FALSE) %>% 
#   filter(!row_number() == 1)
# KTOA (Torrance) and KLGB (Long Beach) station files; the first data row is a
# units header, so drop it. row_number() != 1 reads directly instead of relying
# on !/== precedence.
ktoa_data <- read_csv('data/KTOA_2020-2023.csv', show_col_types = FALSE) %>% 
  filter(row_number() != 1)
klgb_data <- read_csv('data/KLGB_2020-2023.csv', show_col_types = FALSE) %>% 
  filter(row_number() != 1)
# compute daily average temperature, humidity, and precipitation
# klax_daily <- klax_data %>%
#   group_by(Date) %>%
#   summarise(avg_temp = mean(as.numeric(air_temp), na.rm=T),
#             avg_hum = mean(as.numeric(relative_humidity), na.rm=T),
#             precip = precip_accum_24_hour[which(!is.na(precip_accum_24_hour))[1]]) %>%
#   mutate(precip = coalesce(as.numeric(precip), 0),
#          Date = parse_date_time(Date, c('m/d/y')),
#          Date = as.POSIXct(Date, "%Y-%m-%d", tz = 'UTC'))

# klax_daily was built earlier (with wind); its ws/wd have already served the
# El Segundo / St. Anthony substitution, so drop them before the weather joins.
klax_daily <- klax_daily %>% 
  select(-c(ws, wd))

# Daily aggregates: KLGB has a precipitation gauge, KTOA does not.
# TRUE spelled out — `T` is an ordinary reassignable variable in R.
klgb_daily <- klgb_data %>%
  group_by(Date) %>%
  summarise(avg_temp = mean(as.numeric(air_temp), na.rm = TRUE),
            avg_hum = mean(as.numeric(relative_humidity), na.rm = TRUE),
            # the 24h accumulation is reported once per day: take first non-NA
            precip = precip_accum_24_hour[which(!is.na(precip_accum_24_hour))[1]]) %>%
  mutate(precip = coalesce(as.numeric(precip), 0),
         Date = parse_date_time(Date, c('m/d/y')),
         Date = as.POSIXct(Date, "%Y-%m-%d", tz = 'UTC'))

ktoa_daily <- ktoa_data %>%
  group_by(Date) %>%
  summarise(avg_temp = mean(as.numeric(air_temp), na.rm = TRUE),
            avg_hum = mean(as.numeric(relative_humidity), na.rm = TRUE)) %>%
  mutate(Date = parse_date_time(Date, c('m/d/y')),
         Date = as.POSIXct(Date, "%Y-%m-%d", tz = 'UTC'))

remove(klax_data, ktoa_data, klgb_data)

# notice that the Torrance station is missing 10 days of observations
# for that 10 days, use the long beach airport data as substitutes
ktoa_daily <- bind_rows(ktoa_daily, 
                        anti_join(klgb_daily,ktoa_daily, by = 'Date') %>% select(-precip))
# map each monitor to closest monitor
weather_stations <- tibble(station = c('KLAX', 'KLGB', 'KTOA'),
                           latitude = c(33.93806, 33.81167, 33.803979),
                           longitude = c(-118.33194, -118.38889, -118.339432)) %>%
  st_as_sf(coords = c('longitude', 'latitude')) %>% 
  st_set_crs(4326)

weather_stations <- st_transform(weather_stations, CRS_UTM)

# Temperature/humidity come from the nearest of all three stations.
closest_temp_hum_station <- weather_stations$station[st_nearest_feature(monitors, weather_stations)]

# KTOA has no precipitation gauge, so precipitation comes from the nearest of
# KLAX/KLGB. Index into the *filtered* table: st_nearest_feature() returns row
# numbers of its second argument, and the original indexed the unfiltered
# station vector, which only lined up because KTOA happened to be listed last.
precip_stations <- weather_stations %>% filter(station != 'KTOA')
closest_precip_station <- precip_stations$station[st_nearest_feature(monitors, precip_stations)]

monitors_weather <- tibble(Monitor = monitors$Monitor, 
                           weather_station = closest_temp_hum_station,
                           closest_precip_station = closest_precip_station)

# Long tables keyed by station id so each monitor picks up its own station's
# weather through the keyed joins below.
weather_full <- bind_rows(
  klax_daily %>% select(-precip) %>% mutate(weather_station = 'KLAX'),
  klgb_daily %>% select(-precip) %>% mutate(weather_station = 'KLGB'),
  ktoa_daily %>% mutate(weather_station = 'KTOA'))

precip_full <- bind_rows(
  klax_daily %>% select(Date, precip) %>% mutate(closest_precip_station = 'KLAX'),
  klgb_daily %>% select(Date, precip) %>% mutate(closest_precip_station = 'KLGB'))

full_data <- full_data %>%
  left_join(monitors_weather, join_by(Monitor)) %>%
  left_join(weather_full, join_by(day == Date, weather_station)) %>%
  left_join(precip_full, join_by(day == Date, closest_precip_station))

# Weather helpers merged; free the memory.
rm(klax_daily, ktoa_daily, klgb_daily, precip_full, weather_full, 
   monitors_weather, weather_stations)
gc()
##             used   (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   3965467  211.8    6079797  324.7   6079797  324.7
## Vcells 340334119 2596.6  812609506 6199.8 676897829 5164.4

Well production data

# Monthly production, full well universe (producing since 2000), and current
# well headers.
well_prod <- read_csv('data/LA well prod Dec2023 Well Monthly Production.CSV', show_col_types = FALSE)
full_well_info <- read_csv('data/well_active_inactive_prod_since_2000.CSV', show_col_types = FALSE)
active_well_info <- read_csv('data/LA well prod Dec2023 Well Headers.CSV', show_col_types = FALSE)
# Inactive wells = wells in the full universe that are absent from the current
# header file; keep one UTM-projected point per well.
inactive_well_info <- full_well_info %>%
  anti_join(active_well_info, join_by(API14)) %>%
  rename(lon = `Surface Hole Longitude (WGS84)`,
         lat = `Surface Hole Latitude (WGS84)`) %>%
  select(API14, lon, lat) %>%
  distinct() %>%
  st_as_sf(coords = c('lon', 'lat')) %>% 
  st_set_crs(4326) %>%
  st_transform(CRS_UTM) 

# Attach header attributes to each monthly production row.
well_prod <- well_prod %>%
  left_join(active_well_info, join_by(`API/UWI` == API14)) 

# Keep only the columns used downstream; the .x suffixes come from column-name
# collisions in the join above.
well_prod <- well_prod %>%
  select(`API/UWI`, `Monthly Production Date`, `Monthly Oil`, `Monthly Gas`, 
         `Monthly Water`, `Monthly BOE`, Days, `Operator Company Name.x`, 
         `Production Type.x`, `Well Status.x`, `Drill Type`, 
         `Surface Hole Latitude (WGS84)`, `Surface Hole Longitude (WGS84)`) %>%
  rename(lon = `Surface Hole Longitude (WGS84)`,
         lat = `Surface Hole Latitude (WGS84)`)

# project distinct well locations to UTM
active_well_location <- well_prod %>%
  select(`API/UWI`, lon, lat) %>%
  distinct() %>%
  st_as_sf(coords = c('lon', 'lat')) %>% 
  st_set_crs(4326) %>%
  st_transform(CRS_UTM) 

# distance to closest monitor for each well
monitor_active_wells <- tibble()
for (i in 1:nrow(monitors)){
  monitor_well_api <- drop_units(st_distance(monitors[i, ], active_well_location))
  monitor_well_api <- active_well_location[which(monitor_well_api <= 2000), ] %>% st_drop_geometry()
  monitor_active_wells <- rbind(monitor_active_wells, 
                                tibble(Monitor = monitors[i, 1]$Monitor, 
                                       API14 = monitor_well_api$`API/UWI`))
}

# Monthly totals of wells within 2 km of each monitor. A well can lie within
# 2 km of several monitors and each API has many monthly rows, so the join is
# legitimately many-to-many; declare it to silence the warning (see output
# below). .groups = 'drop' silences the regrouping message.
monitor_well_prod <- monitor_active_wells %>% 
  left_join(well_prod, join_by(API14 == `API/UWI`),
            relationship = 'many-to-many') %>%
  group_by(Monitor, `Monthly Production Date`) %>%
  summarise(active_2km = n(),
            monthly_oil_2km = sum(`Monthly Oil`, na.rm = TRUE),
            monthly_gas_2km = sum(`Monthly Gas`, na.rm = TRUE),
            .groups = 'drop') %>%
  mutate(yearmonth = format(`Monthly Production Date`, '%Y-%m')) %>%
  select(-`Monthly Production Date`)
## Warning in left_join(., well_prod, join_by(API14 == `API/UWI`)): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 18032 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
## `summarise()` has grouped output by 'Monitor'. You can override using the
## `.groups` argument.
# Monitor-months with no nearby active wells get zeros rather than NA.
full_data <- full_data %>%
  left_join(monitor_well_prod, join_by(Monitor, yearmonth)) %>%
  mutate(active_2km = coalesce(active_2km, 0),
         monthly_oil_2km = coalesce(monthly_oil_2km, 0),
         monthly_gas_2km = coalesce(monthly_gas_2km, 0))

# Count inactive wells within 2 km of each monitor (a static count — no
# production time series). Preallocate and bind once instead of rbind-growing.
inactive_counts <- vector("list", nrow(monitors))
for (i in seq_len(nrow(monitors))){
  well_dist <- drop_units(st_distance(monitors[i, ], inactive_well_info))
  nearby_wells <- inactive_well_info[which(well_dist <= 2000), ] %>% st_drop_geometry()
  inactive_counts[[i]] <- tibble(Monitor = monitors[i, 1]$Monitor, 
                                 inactive_2km = nrow(nearby_wells))
}
monitor_inactive_wells <- bind_rows(inactive_counts)
remove(inactive_counts)

# Static per-monitor inactive-well count joins on Monitor alone.
full_data <- left_join(full_data, monitor_inactive_wells, join_by(Monitor))
# get monthly production data at different distance levels (monitor to well)
# prod_data <- list(
#     well_prod %>% 
#       filter(`1km` == TRUE) %>% 
#       group_by(`Monthly Production Date`) %>% 
#       summarise(active_1km = n(),
#                 monthly_oil_1km = sum(`Monthly Oil`, na.rm = TRUE),
#                 monthly_gas_1km = sum(`Monthly Gas`, na.rm = TRUE),
#                 monthly_water_1km = sum(`Monthly Water`, na.rm = TRUE),
#                 monthly_boe_1km = sum(`Monthly BOE`, na.rm = TRUE))
#     ,
#     well_prod %>% 
#       filter(`2p5km` == TRUE) %>% 
#       group_by(`Monthly Production Date`) %>% 
#       summarise(active_2p5km = n(),
#             monthly_oil_2p5km = sum(`Monthly Oil`, na.rm = TRUE),
#             monthly_gas_2p5km = sum(`Monthly Gas`, na.rm = TRUE),
#             monthly_water_2p5km = sum(`Monthly Water`, na.rm = TRUE),
#             monthly_boe_2p5km = sum(`Monthly BOE`, na.rm = TRUE))
#     ,
#     well_prod %>% 
#       filter(`5km` == TRUE) %>% 
#       group_by(`Monthly Production Date`) %>% 
#       summarise(active_5km = n(),
#             monthly_oil_5km = sum(`Monthly Oil`, na.rm = TRUE),
#             monthly_gas_5km = sum(`Monthly Gas`, na.rm = TRUE),
#             monthly_water_5km = sum(`Monthly Water`, na.rm = TRUE),
#             monthly_boe_5km = sum(`Monthly BOE`, na.rm = TRUE))
#   ) %>%
#   reduce(inner_join, by = 'Monthly Production Date') %>%
#   mutate(yearmonth = format(`Monthly Production Date`, '%Y-%m'))
# 
# full_data <- full_data %>%
#   left_join(prod_data, join_by(yearmonth))
# cleanup columns
full_data <- full_data  %>%
  select(-c('Date.Time', 'MDL', 'Unit', 'Averaging.Hour', 'Join_Count', 'siteId', 'unitName', 
            'qcName', 'opName', 'windSpeed_Unit', 'windDirection_Unit', 'opCode', 'Source',
            'DaylightSavings', 'QC.Code', 'wd_QCcode', 'ws_QCcode'))

Downwind indicator w.r.t. refinery

# Create variable to indicate downwind wrt refinery.
# Converted_Angle - 180 is the bearing of wind that would carry refinery air
# toward the monitor (presumably — confirm against the shapefile pipeline that
# produced Converted_Angle). pmin(d, 360 - d) is the circular gap, identical
# to the ifelse(d > 180, 360 - d, d) formulation; downwind = gap <= 15 degrees.
wind_diff <- abs(full_data$Converted_Angle - 180 - full_data$wd)
wind_diff <- pmin(wind_diff, 360 - wind_diff)
full_data$downwind_ref <- as.integer(wind_diff <= 15)

Downwind of wrp

# Finally, check if wind direction is downwind w.r.t. the wrp: same +-15 degree
# circular-gap test, using the monitor-to-WRP azimuth computed earlier.
wind_diff <- abs(full_data$wrp_angle - full_data$wd)
wind_diff <- pmin(wind_diff, 360 - wind_diff)

full_data$downwind_wrp <- as.integer(wind_diff <= 15)

Compute UTM coordinates for monitors

# Re-project monitor coordinates to UTM and keep the x/y as plain columns.
CRS_UTM <- CRS("+proj=utm +zone=11 ellps=WGS84")

monitor_pts <- full_data %>%
  select(Monitor, latitude, longitude) %>%
  distinct() %>% 
  st_as_sf(coords = c('longitude', 'latitude')) %>% 
  st_set_crs(4326) %>% 
  st_transform(CRS_UTM)
utm_xy <- st_coordinates(monitor_pts)
monitors <- tibble(Monitor = monitor_pts$Monitor,
                   mon_utm_x = utm_xy[, 1],
                   mon_utm_y = utm_xy[, 2])

full_data <- full_data %>%
  left_join(monitors, join_by(Monitor))
# Compute daily average wd/ws
# Collapse the observation-level table to one row per monitor-day: keep only
# columns that are constant within a monitor-day plus raw ws/wd, average wind
# speed arithmetically and wind direction on the circle, then drop the raw
# per-observation ws/wd so distinct() deduplicates to daily rows.
daily_full <- full_data %>%
  select(H2S_daily_max, H2S_daily_avg, H2S_monthly_average, ws, wd, latitude, 
         longitude, mon_utm_x, mon_utm_y, Monitor, MinDist, 
         Converted_Angle, Refinery, year, month, day, weekday, 
         active_2km, monthly_oil_2km, monthly_gas_2km, inactive_2km,
         dist_wrp, capacity, wrp_angle, dist_dc, dist_213,
         avg_temp, avg_hum, precip) %>%
  group_by(Monitor, day) %>%
  mutate(ws_avg = mean(ws, na.rm=TRUE),
         # circular mean can be negative; normalised into [0, 360) below
         wd_avg = as.numeric(mean(circular(wd, units = 'degrees'), na.rm=TRUE))) %>%
  mutate(wd_avg = if_else(wd_avg < 0, wd_avg+360, wd_avg)) %>%
  ungroup() %>%
  rename(monitor_lat = latitude, monitor_lon = longitude)  %>%
  select(-c(wd, ws)) %>%
  distinct()
# Get the downwind indicators for daily data: same +-15 degree circular-gap
# tests as above, applied to the daily-average wind direction.
wind_diff <- abs(daily_full$Converted_Angle - 180 - daily_full$wd_avg)
wind_diff <- pmin(wind_diff, 360 - wind_diff)
daily_full$daily_downwind_ref <- as.integer(wind_diff <= 15)

wind_diff <- abs(daily_full$wrp_angle - daily_full$wd_avg)
wind_diff <- pmin(wind_diff, 360 - wind_diff)

daily_full$daily_downwind_wrp <- as.integer(wind_diff <= 15)

Get monitor locations

# One row per monitor with its (corrected) lat/lon.
monitors_coord <- full_data %>%
  select(Monitor, latitude, longitude) %>%
  distinct()

# Plain one-column frame of monitor names for the raster extracts below.
monitors <- monitors_coord %>%
  select(Monitor)
  
# sp-style promotion: this converts monitors_coord in place into a
# SpatialPointsDataFrame, which is the form used by extract() below.
coordinates(monitors_coord) <- ~ longitude + latitude

Load in elevation data

# Elevation raster (.hgt tile covering the study area); sample at each monitor.
elevation <- raster('shapefiles/N33W119.hgt')

monitors <- monitors %>%
  as.data.frame() %>%
  mutate(elevation = extract(elevation, monitors_coord))

Load in EVI data

#evi <- raster('shapefiles/MOD13Q1.061__250m_16_days_EVI_doy2023145_aid0001.tif')
# MODIS EVI raster; stored values carry a 0.0001 scale factor, applied
# here so the sampled column (renamed EVI, position 3) is in EVI units.
evi <- raster('shapefiles/MOD13Q1.006__250m_16_days_EVI_doy2022177_aid0001.tif')
evi_at_monitors <- extract(evi, monitors_coord) * 0.0001
monitors <- monitors %>%
  cbind(evi_at_monitors) %>%
  as.data.frame() %>%
  rename(EVI = 3)

Merge EVI and elevation into the data

# Attach elevation and EVI (keyed by monitor) to both data sets.
full_data <- left_join(full_data, monitors, by = join_by(Monitor))

daily_full <- left_join(daily_full, monitors, by = join_by(Monitor))

Merge Odor Complaint data

# Odor complaints by zip code and date. show_col_types = FALSE matches
# the KLAX read earlier in the file and quiets the column-spec banner.
# The file's unnamed first column is auto-repaired to `...1` (dropped
# when monitor_odor is built below).
odor <- read_csv('data/odorcomplaintdata_2018_2023.csv', show_col_types = FALSE)
## New names:
## • `` -> `...1`
la_county <- read_sf('shapefiles/Zip_Codes_(LA_County)/Zip_Codes_(LA_County).shp')
# https://gis.stackexchange.com/questions/282750/identify-polygon-containing-point-with-r-sf-package
# Build an sfc point geometry (WGS84) from the sp coordinate matrix.
# seq_len() replaces 1:nrow(), which would yield c(1, 0) on an empty
# coordinate matrix.
pnts_sf <- do.call(st_sfc, c(
  lapply(seq_len(nrow(monitors_coord@coords)),
         function(i) st_point(as.numeric(monitors_coord@coords[i, ]))),
  list(crs = 4326)
))

pnts_trans <- st_transform(pnts_sf, CRS_UTM)        # points -> UTM
la_county_trans <- st_transform(la_county, CRS_UTM) # polygons -> UTM

# For each monitor point (one column of the intersection matrix), pull
# the ZIPCODE of the polygon(s) containing it.
monitors_coord@data$county <- apply(
  st_intersects(la_county_trans, pnts_trans, sparse = FALSE), 2,
  function(col) {
    la_county_trans[which(col), ]$ZIPCODE
  }
)
library(ggrepel)
# Plot results to double check.
# Keep only the zip polygons that actually contain a monitor.
in_use <- la_county$ZIPCODE %in% unique(monitors_coord$county)
la_county_present <- la_county[in_use, ]

# Flat lookup of monitor name, assigned zip, and raw lon/lat.
monitor_zip <- tibble(Monitor = monitors_coord@data$Monitor,
                      zip = monitors_coord@data$county,
                      longitude = monitors_coord@coords[, 1],
                      latitude = monitors_coord@coords[, 2])

# Base map: zip polygons filled by ZIPCODE, monitors as points.
monitor_zip_graph <- ggplot() +
  geom_sf(data = la_county_present, aes(fill = ZIPCODE)) +
  geom_point(data = monitor_zip, aes(x = longitude, y = latitude))

# Add non-overlapping monitor-name labels and render.
monitor_zip_graph +
  geom_label_repel(data = monitor_zip,
                   aes(x = longitude, y = latitude, label = Monitor),
                   box.padding = 0.35,
                   point.padding = 0.5,
                   segment.color = 'grey50') +
  theme_minimal()

# Keep only complaints from zip codes that contain a monitor.
odor <- odor[odor$zip %in% unique(monitors_coord$county), ]

# Join complaints to monitors by zip. Several monitors share a zip and
# each zip has many dated complaint rows, so the fan-out is intentional;
# declaring relationship = "many-to-many" silences the dplyr warning the
# original run emitted and documents that the expansion is expected.
monitor_odor <- monitors_coord@data %>%
  left_join(odor %>% mutate(zip = as.character(zip)),
            join_by(county == zip),
            relationship = "many-to-many") %>%
  select(-`...1`)
# Attach the monitor attribute table (including assigned zip/county) to
# both the full and the daily data sets.
full_data <- left_join(full_data, monitors_coord@data, by = join_by(Monitor))

daily_full <- left_join(daily_full, monitors_coord@data, by = join_by(Monitor))

# Merge daily complaint counts into each data set; monitor-days with no
# matching complaint record get 0 instead of NA.
full_data <- full_data %>%
  left_join(monitor_odor %>% select(Monitor, date, num_odor_complaints),
            join_by(Monitor, day == date)) %>%
  mutate(num_odor_complaints = coalesce(num_odor_complaints, 0))

daily_full <- daily_full %>%
  left_join(monitor_odor %>% select(Monitor, date, num_odor_complaints),
            join_by(Monitor, day == date)) %>%
  mutate(num_odor_complaints = coalesce(num_odor_complaints, 0))
# Complaint counts over time, one line per zip; rendered interactively.
odor_graph <- odor %>%
  mutate(zip = as.character(zip)) %>%
  filter(zip %in% unique(monitors_coord$county)) %>%
  ggplot(aes(x = date, y = num_odor_complaints, group = zip, color = zip)) +
  geom_line() +
  labs(title = "Number of odor complaints over time", y = 'Number of odor complaints', x = 'time') +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45))

ggplotly(odor_graph) %>% layout(dragmode = 'pan')
# Same plot with the y-axis capped at 30 to hide extreme outliers.
odor_graph_b <- odor %>%
  mutate(zip = as.character(zip)) %>%
  filter(zip %in% unique(monitors_coord$county)) %>%
  ggplot(aes(x = date, y = num_odor_complaints, group = zip, color = zip)) +
  geom_line() +
  labs(title = "Number of odor complaints over time w/o outliers", y = 'Number of odor complaints', x = 'time') +
  scale_y_continuous(limits = c(0, 30)) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45))

ggplotly(odor_graph_b) %>% layout(dragmode = 'pan')
# Persist the finished data sets.
saveRDS(full_data, 'data/full_data.rds')
saveRDS(daily_full, 'data/daily_full.rds')

# Free intermediates before gc(). BUG FIX: the original listed
# `monitor_coord` — a typo for `monitors_coord` — so remove() warned
# "object 'monitor_coord' not found" and the large spatial object was
# never released.
remove(la_county, la_county_present, la_county_trans, monitor_odor, monitor_zip,
       monitor_zip_graph, monitors, monitors_coord, odor, odor_graph, odor_graph_b,
       pnts_sf, pnts_trans, elevation, evi)
gc()
##             used   (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   4226784  225.8    7609004  406.4   6079797  324.7
## Vcells 331324847 2527.9  812609506 6199.8 812124887 6196.1

Other issues